R Code for Lecture 22 (Wednesday, November 7, 2012)


## generate goodness of fit graph from last time ###

 gala <- read.table('ecol 563/galapagos.txt', header=T)
# fit NB-2 model
library(MASS)
out.NB2 <- glm.nb(Species~log(Area), data=gala)
gala$mu <- fitted(out.NB2)
gala$z <- dnbinom(gala$Species, mu=fitted(out.NB2), size=out.NB2$theta)
out.p <- lapply(1:29, function(x) dnbinom(0:1000, mu=fitted(out.NB2)[x], size=out.NB2$theta))
gala$ux <- sapply(1:29, function(x) max((0:1000)[out.p[[x]]>1e-5]))
gala$lx <- rep(0, 29)
gala$uy <- sapply(out.p, max)
gala$ly <- rep(0, 29)
library(lattice)
prepanel.ci2 <- function(x, y, ly, lx, uy, ux, subscripts, ...) {
list(ylim=range(y, uy[subscripts], ly[subscripts], finite=T), xlim=range(x, ux[subscripts], lx[subscripts], finite=T))}
xyplot(z~Species|Island, data=gala, ux=gala$ux, lx=gala$lx, uy=gala$uy, ly=gala$ly, prepanel=prepanel.ci2, ylab='Probability', xlab='Species richness',  panel=function(x, y, subscripts){
panel.xyplot(0:1000, dnbinom(0:1000, mu=gala$mu[subscripts], size=out.NB2$theta), type='h', col='grey')
panel.abline(v=x, col=2, lty=2)},
scale=list(x='free', y='free'), strip=strip.custom(par.strip.text=list(cex=0.8)))

# there are 29 panels
# we can spready panels over multiple pages by using the layout argument
# to plot fewer than 29 panels on a page

xyplot(z~Species|Island, data=gala, ux=gala$ux, lx=gala$lx, uy=gala$uy, ly=gala$ly, prepanel=prepanel.ci2, ylab='Probability', xlab='Species richness',  layout=c(4,4), panel=function(x, y, subscripts){
panel.xyplot(0:1000, dnbinom(0:1000, mu=gala$mu[subscripts], size=out.NB2$theta), type='h', col='grey')
panel.abline(v=x, col=2, lty=2)},
scale=list(x='free', y='free'), strip=strip.custom(par.strip.text=list(cex=0.8)))

# we can pause the display of graphs before each page
par(ask=T)
xyplot(z~Species|Island, data=gala, ux=gala$ux, lx=gala$lx, uy=gala$uy, ly=gala$ly, prepanel=prepanel.ci2, ylab='Probability', xlab='Species richness',  layout=c(4,4), panel=function(x, y, subscripts) {
panel.xyplot(0:1000, dnbinom(0:1000, mu=gala$mu[subscripts], size=out.NB2$theta), type='h', col='grey')
panel.abline(v=x, col=2, lty=2)},
scale=list(x='free', y='free'), strip=strip.custom(par.strip.text=list(cex=0.8)))

# reset ask parameter to its default setting
par(ask=F)

# residual deviance can be a goodness of fit statistic
out.pois <- glm(Species~log(Area), data=gala, family=poisson)
summary(out.pois)

# the deviance residuals
residuals(out.pois, type='deviance')
# sum of the squared deviance residuals is the deviance
sum(residuals(out.pois, type='deviance')^2)
# Pearson residuals
residuals(out.pois, type='pearson')
# sum of the squared Pearson residuals is the Pearson deviance
sum(residuals(out.pois, type='pearson')^2)

# to use the residual deviance as a goodness of fit statistic
# the expected counts need to be sufficiently large
fitted(out.pois)
sum(fitted(out.pois)<5)

# carry out goodness of fit test
summary(out.pois)
1-pchisq(649.22,27)
# overdispersion if ratio is much larger than 1
649.22/27
# technically we should use the Pearson deviance for a Poisson model
# but the difference is usually trivial

# although the residual deviance is reported for an NB model
# it is not a goodness of fit statistic
out.NB2 <- glm.nb(Species~log(Area), data=gala)
summary(out.NB2)
# invalid test
1-pchisq(31.295,27)


#### Jamaican birds data set ####
birds <- read.csv('ecol 563/birds.csv', header=T)
birds[1:8,]

# missing values in response yield missing means
tapply(birds$S,birds$landscape,mean)

# obtain mean richness by landscape 
tapply(birds$S,birds$landscape,mean,na.rm=T)
# obtain mean richness by year by landscape
tapply(birds$S,list(birds$year,birds$landscape), mean, na.rm=T)

# include year and landscape as predictors
out.S <- glm(S ~ factor(year)+landscape, data=birds, family=poisson)
summary(out.S)
anova(out.S, test='Chisq')
summary(out.S)

# all predicted counts exceeed 5: deviance GoF test is OK
sum(fitted(out.S)<5)
# significant lack of fit--we need more predictors to account for overdispersion
1-pchisq(532.25,251)

# plot of S versus area indicates log linearizes the relationship
plot(S~area, data=birds)
plot(S~log(area), data=birds)
out.S1 <- glm(S ~ factor(year)+landscape+log(area), data=birds, family=poisson)

# compare models with and without log(Area)
AIC(out.S,out.S1)
anova(out.S1,test='Chisq')
summary(out.S1)

# deviance goodness of fit test is now almost adequate
1-pchisq(297.53,250)

# there are two ways to include an offset term
out.S2 <- glm(S ~ factor(year)+landscape+offset(log(area)), data=birds, family=poisson)
out.S3 <- glm(S ~ factor(year)+landscape, offset=log(area), data=birds, family=poisson)

# the offset term does not appear in output
summary(out.S2)
# but estimates have been adjusted
coef(out.S2)
coef(out.S)
coef(out.S1)

# examine 95% confidence interval for log(area): does it include 1?
summary(out.S1)

# compare AIC of oriinal model, ancova model, and offset model
AIC(out.S, out.S1, out.S2)

# number of repeated measures per patch
table(birds$patch[!is.na(birds$S)])

library(lme4)
# fit Poisson model with patch random effects
out.lmer <- lmer(S~factor(year)+landscape+log(area) + (1|patch), data=birds, family=poisson)

# compare estimates
summary(out.lmer)
coef(out.S1)

# AIC and log-likelihood returned by lmer
# not comparable to that returned by glm
AIC(out.lmer)
AIC(out.S1)
logLik(out.S1)
logLik(out.lmer)


## function to calculate marginal log-likelihood of Poisson random intercepts model

poisloglike.log<-function(int.mod, mult=1) {
# int.mod is the Poisson model random intercepts model fit by lmer
# mult is a large multiplier, e.g., 1e40, 1e80, 1e120, etc.
# used to improve the accuracy of the numerical integration
####### extract model components #######
#level-1 variance
s <- attr(VarCorr(int.mod)[[1]],"stddev")
#Poisson mean
etavec<-int.mod@X %*% fixef(int.mod)
#check if offset was used
if(length(int.mod@offset)>0) etavec <- etavec+int.mod@offset
#response variable
y <- int.mod@y
#split the response and Poisson mean by group
temp.dat <- data.frame(y,etavec)
split.by.group <- split(temp.dat, int.mod@flist)
#function to evaluate the integrated likelihood for a group
pois.component <- function(xvec) {
#likelihood term for the first element in the group
base.eta <- paste("exp(", xvec[1,2], "+x)", sep='')
base.pois <- paste("dpois(", xvec[1,1], ",", base.eta,")", sep='')
#add more terms to the product if the group size is > 1
if(nrow(xvec)>1)
for(i in 2:nrow(xvec)) {
cur.eta <- paste("exp(", xvec[i,2], "+x)", sep='')
cur.pois <- paste("dpois(", xvec[i,1], ",", cur.eta,")", sep='')
base.pois <- paste(base.pois, cur.pois, sep='*')
}
#construct the integral as a text expression
int.call <- paste("integrate(function(x) ", base.pois, "*dnorm(x,0,",s,")*mult, -Inf, Inf, rel.tol = 1e-12)", sep='')
#calculate integral
eval(parse(text=int.call))
}
#obtain likelihood terms by group
sapply(split.by.group,pois.component) -> like.terms
#log-likelihood
sum(log(unlist(like.terms[1,])/mult))
}

#### end function ####

# log-likelihood of random effects model
poisloglike.log(out.lmer)
# log-likelihood of model without random effects
logLik(out.S1)

# number of estimated parameters
attr(logLik(out.lmer),"df")

# calculate AIC of random intercepts model
-2*poisloglike.log(out.lmer)+2*attr(logLik(out.lmer),"df")
# compare to model without random effects
AIC(out.S1)